
## this is the core analytical file for the cross-sectional debian underproduction analysis 

## Step 0: load libraries, set variables, make some helper functions
library(dplyr)
library(reldist)
library(ggplot2)
library(scales)
library(stringr)
library(tidyr)
library(eulerr)
library(effects)
library(ggeffects)
library(sjPlot)
library(texreg)

# Work from the project root so the relative paths below resolve.
setwd(dir = "path/to/your/dir")
# Restore the saved analysis objects.
# NOTE(review): presumably this is where packageDF, langDF and langAgeDF come
# from (they are used below but never created in this file) -- confirm.
# packageDF$yearsOld is the package age in years.
load(file = "processed_data/fullDataset.RData")
# Output locations: figures, and the data handed to the LaTeX/knitr document.
figDir <- "path/to/your/figures/"
pubDataDir <- "path/to/your/latex/knitr/datadir"
# Helpers for making Rtex docs etc.
source(file = "lib-00-utils.R")

# Build a scales transformation that maps x to -log(x, base), i.e. a log
# scale that runs in the opposite direction.
#
# base: base of the logarithm (defaults to e).
# Returns whatever trans_new() builds from the pieces below.
reverselog_trans <- function(base = exp(1)) {
  trans <- function(x) -log(x, base)
  inv <- function(x) base^(-x)
  # Every argument is passed by name. Recent scales releases place
  # d_transform/d_inverse ahead of breaks in this signature, so a positional
  # fourth argument would no longer be taken as the breaks function.
  trans_new(
    name = paste0("reverselog-", format(base)),
    transform = trans,
    inverse = inv,
    breaks = log_breaks(base = base),
    domain = c(1e-100, Inf)
  )
}

# Add a meanLangAge column to df.
#
# For each row, every language column holding a non-NA, non-zero value counts
# as "in use". meanLangAge is the sum of (cell value * language age) over the
# languages in use, divided by the NUMBER of languages in use (not by the sum
# of the cell values). Rows with no language in use get NA.
#
# df:       data frame with one column per entry of langList.
# langList: names of the language columns; defaults to every column of the
#           global langDF except the first.
# langAges: data frame with columns `language` and `deltaNow` (the age used
#           for that language); defaults to the global langAgeDF.
#
# Returns df with the extra numeric column meanLangAge. Stops with an error
# if a language that is in use has no row in langAges.
generateMeanLangAges <- function(df, langList = colnames(langDF)[-1],
                                 langAges = langAgeDF) {
  nPackages <- nrow(df)
  langValues <- as.matrix(df[, langList, drop = FALSE])
  # NA & <anything FALSE> is FALSE, so NA cells are never counted as in use
  inUse <- !is.na(langValues) & langValues != 0

  # look each language's age up once instead of once per cell
  ageRow <- match(langList, langAges$language)
  unknown <- is.na(ageRow) & colSums(inUse) > 0
  if (any(unknown)) {
    stop("no entry in the language age table for: ",
         paste(langList[unknown], collapse = ", "), call. = FALSE)
  }
  ages <- as.numeric(langAges$deltaNow[ageRow])

  # rep(..., each = nPackages) lines the ages up with the matrix columns
  weighted <- langValues * rep(ages, each = nPackages)
  weighted[!inUse] <- 0
  ageSum <- rowSums(weighted)
  langCount <- rowSums(inUse)

  meanAge <- rep(NA_real_, nPackages)
  hasLang <- langCount > 0
  meanAge[hasLang] <- ageSum[hasLang] / langCount[hasLang]
  df$meanLangAge <- meanAge
  return(df)
}

## Step 1: Some cleanup and type fixes. Calculate key variable prop_team

## R doesn't like "+" or "-" in colnames and in some cases munges them into
## ".", so "c++" shows up as "c.." and "c-sharp" as "c.sharp". Give langAgeDF
## a duplicate row under each munged spelling.
mungedNames <- c("c++" = "c..", "c-sharp" = "c.sharp")
for (realName in names(mungedNames)) {
  newRow <- langAgeDF[langAgeDF$language == realName, ]
  newRow$language <- mungedNames[[realName]]
  langAgeDF <- rbind(langAgeDF, newRow)
}

## TRUE when maintainerCount is above 1; anything else, NA included, is FALSE
packageDF$maintainerTurnover <- (packageDF$maintainerCount > 1) %in% TRUE
packageDF <- unique(packageDF)
packageDF$maintainerTurnover <- as.factor(packageDF$maintainerTurnover)
packageDF$yearsOld <- as.numeric(packageDF$yearsOld)

## key variable: list_maint as a share of uploadCount
packageDF$prop_team <- with(packageDF, list_maint / uploadCount)
summary(packageDF$prop_team)
hist(packageDF$prop_team) #note that it is rather bimodal

## language age measured against the fixed reference year 2023
langAgeDF$deltaNow <- 2023 - as.numeric(langAgeDF$release.one)

#debugging this function with a test subset
#testDF <- subset(packageDF, packageDF$inst.rank < 100)
#temp <- generateMeanLangAges(testDF)
packageDF <- generateMeanLangAges(packageDF)

## Step 2: Run Models
## Four binomial glm fits of underprod on packageDF, differing only in which
## predictors are included.

## M1: no language or network measures
m1 <- glm(
  underprod ~ yearsOld +
    uploaderCount +
    maintainerTurnover +
    prop_team,
  data = packageDF,
  family = "binomial"
)
summary(m1)

## M2: No language measures
m2 <- glm(
  underprod ~ yearsOld +
    uploaderCount +
    maintainerTurnover +
    prop_team +
    Eig.comment +
    Betweenness.comment,
  data = packageDF,
  family = "binomial"
)
summary(m2)

## M3: No network measures
m3 <- glm(
  underprod ~ yearsOld +
    meanLangAge +
    yearsOld*meanLangAge +
    uploaderCount +
    maintainerTurnover +
    prop_team,
  data = packageDF,
  family = "binomial"
)
summary(m3)

## M4: all measures
m4 <- glm(
  underprod ~ yearsOld +
    meanLangAge +
    yearsOld*meanLangAge +
    uploaderCount +
    maintainerTurnover +
    prop_team +
    Eig.comment +
    Betweenness.comment,
  data = packageDF,
  family = "binomial"
)
summary(m4)

### will need these preds for the visuals later
preds <- ggpredict(
  m4,
  terms = c("yearsOld [0:30]", "meanLangAge [25, 48]")
)
#preds <- data.frame(predict.glm(allFactors2, type="response", se.fit=TRUE))

## save out what the paper needs via remember

## number of aliases
## monster table is all four models, m1, m2, m3, and m4
## m4 coefs for H1, H2, H4, H5, H6, H7, H8 interp

## Capture the printed LaTeX table as a character vector (one element per
## line) while still echoing it to the console. capture.output() restores the
## output sink itself, even if texreg() fails part-way through.
monster.texreg <- capture.output( #remembered
  texreg(list(m1,m2,m3,m4), omit.coef = 'factor', stars=NULL, digits=2,
         custom.model.names=c('M1: no lang/network measures', 'M2: No language measures', 'M3: No network measures', 'M4: Full model'),
         custom.coef.names=c('(Intercept)', 'Package Age (years)', 'Uploader Count', 'Did maintainer change?', 'Team proportion', 'Eigenvector Centrality', 'Betweenness Centrality', 'Mean Language Age', 'Package Age : Mean Language Age'),
         use.packages=FALSE, table=FALSE, ci.force = TRUE),
  type = "output", split = TRUE)

m4.coefs <- summary(m4)$coefficients ## for easy reference within the latex document


## NOTE(review): nosave and remember() are not defined in this file;
## presumably they come from lib-00-utils.R -- confirm.
if (!nosave) {
  r <- list()
  remember(m4.coefs)
  remember(monster.texreg)
  ## file.path() puts a separator between directory and file name, so the
  ## file lands inside pubDataDir whether or not it ends in a slash
  save(r, file=file.path(pubDataDir, "knitr_data.RData"), version=2)
  rm(r)
}


#build the figures for the paper by running visuals.R
